앞서 작업한 데이터와 선형회귀모형과 ARIMA 모형을 저장한 data/corona_fcst_list.rds 리스트 객체를 단변량 시계열 분석 작업을 위해서 풀어둔다.
library(tidyverse)
library(tidymodels)
library(timetk)
library(modeltime)
# 데이터 + 모형 ----
corona_fcst_list <- read_rds("data/corona_fcst_list.rds")
# 데이터 ----
full_tbl <- corona_fcst_list$data
# 모형 ----
wkfl_fit_lm <- corona_fcst_list$model$wkfl_fit_lm
wkfl_fit_arima <- corona_fcst_list$model$wkfl_fit_arimalog1p() 함수는 base 팩키지에 포함된 함수로 로그값에 1을 더해 Inf가 발생되는 것을 막아준다. 원척도로 되돌리려면 expm1() 함수를 사용하면 된다.
library(lubridate)
full_tbl %>%
filter(!is.na(확진자)) %>%
plot_time_series_regression(.date_var = 날짜,
.formula = 확진자 ~ as.numeric(날짜) +
wday(날짜, label = TRUE) +
month(날짜, label = TRUE),
.show_summary = TRUE,
.interactive = FALSE)
Call:
stats::lm(formula = .formula, data = .data)
Residuals:
Min 1Q Median 3Q Max
-294.38 -68.36 -5.55 47.58 684.51
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.353e+04 1.396e+04 -4.551 7.59e-06 ***
as.numeric(날짜) 3.453e+00 7.568e-01 4.563 7.19e-06 ***
wday(날짜, label = TRUE).L 4.078e+01 1.720e+01 2.371 0.018320 *
wday(날짜, label = TRUE).Q -6.671e+00 1.721e+01 -0.388 0.698524
wday(날짜, label = TRUE).C -7.525e+00 1.718e+01 -0.438 0.661756
wday(날짜, label = TRUE)^4 6.834e+00 1.719e+01 0.397 0.691265
wday(날짜, label = TRUE)^5 -1.547e+00 1.716e+01 -0.090 0.928245
wday(날짜, label = TRUE)^6 5.877e+00 1.720e+01 0.342 0.732816
month(날짜, label = TRUE).L -7.821e+02 2.727e+02 -2.868 0.004408 **
month(날짜, label = TRUE).Q 3.880e+02 2.805e+01 13.831 < 2e-16 ***
month(날짜, label = TRUE).C 3.720e+02 2.737e+01 13.591 < 2e-16 ***
month(날짜, label = TRUE)^4 1.123e+02 2.528e+01 4.444 1.22e-05 ***
month(날짜, label = TRUE)^5 2.634e+02 2.372e+01 11.102 < 2e-16 ***
month(날짜, label = TRUE)^6 5.332e+01 2.260e+01 2.359 0.018902 *
month(날짜, label = TRUE)^7 -7.498e+01 2.190e+01 -3.423 0.000699 ***
month(날짜, label = TRUE)^8 1.138e+01 2.166e+01 0.525 0.599641
month(날짜, label = TRUE)^9 -6.653e+01 2.171e+01 -3.065 0.002365 **
month(날짜, label = TRUE)^10 6.282e+01 2.163e+01 2.904 0.003936 **
month(날짜, label = TRUE)^11 3.692e+01 2.161e+01 1.708 0.088541 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 119.5 on 320 degrees of freedom
Multiple R-squared: 0.7749, Adjusted R-squared: 0.7623
F-statistic: 61.21 on 18 and 320 DF, p-value: < 2.2e-16
full_tbl %>%
filter(!is.na(확진자)) %>%
plot_time_series_regression(.date_var = 날짜,
.formula = base::log1p(확진자) ~ as.numeric(날짜) +
wday(날짜, label = TRUE) +
month(날짜, label = TRUE),
.show_summary = TRUE,
.interactive = FALSE)
Call:
stats::lm(formula = .formula, data = .data)
Residuals:
Min 1Q Median 3Q Max
-4.8288 -0.4155 -0.0077 0.3892 4.0164
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.309e+02 1.112e+02 -3.874 0.000130 ***
as.numeric(날짜) 2.358e-02 6.031e-03 3.909 0.000113 ***
wday(날짜, label = TRUE).L 2.986e-01 1.371e-01 2.178 0.030105 *
wday(날짜, label = TRUE).Q -4.420e-02 1.371e-01 -0.322 0.747447
wday(날짜, label = TRUE).C -1.719e-01 1.370e-01 -1.255 0.210439
wday(날짜, label = TRUE)^4 6.970e-02 1.370e-01 0.509 0.611323
wday(날짜, label = TRUE)^5 -6.183e-02 1.368e-01 -0.452 0.651572
wday(날짜, label = TRUE)^6 5.702e-02 1.371e-01 0.416 0.677688
month(날짜, label = TRUE).L -3.956e+00 2.174e+00 -1.820 0.069677 .
month(날짜, label = TRUE).Q -4.242e-01 2.236e-01 -1.897 0.058673 .
month(날짜, label = TRUE).C 2.020e+00 2.182e-01 9.261 < 2e-16 ***
month(날짜, label = TRUE)^4 -9.457e-01 2.015e-01 -4.693 3.99e-06 ***
month(날짜, label = TRUE)^5 1.380e+00 1.891e-01 7.298 2.32e-12 ***
month(날짜, label = TRUE)^6 2.554e-01 1.801e-01 1.418 0.157204
month(날짜, label = TRUE)^7 -1.195e+00 1.746e-01 -6.847 3.86e-11 ***
month(날짜, label = TRUE)^8 6.382e-01 1.726e-01 3.698 0.000256 ***
month(날짜, label = TRUE)^9 -8.868e-01 1.730e-01 -5.126 5.14e-07 ***
month(날짜, label = TRUE)^10 2.800e-01 1.724e-01 1.624 0.105352
month(날짜, label = TRUE)^11 3.528e-01 1.722e-01 2.049 0.041283 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9522 on 320 degrees of freedom
Multiple R-squared: 0.6698, Adjusted R-squared: 0.6513
F-statistic: 36.07 on 18 and 320 DF, p-value: < 2.2e-16
예측할 데이터와 모형개발에 활용할 데이터로 나눈 후에 모형개발에 활용할 데이터를 훈련/시험 데이터로 나눈다.
## 예측 데이터와 모형개발 데이터로 분리
forecast_tbl <- full_tbl %>%
filter(is.na(확진자))
history_tbl <- full_tbl %>%
filter(!is.na(확진자)) %>%
mutate(확진자 = log1p(확진자))
## 훈련/시험 데이터 분할
splits <- history_tbl %>%
time_series_split(date_var = 날짜,
assess = 30,
cumulative = TRUE)
splits<Analysis/Assess/Total>
<309/30/339>
훈련/시험 데이터로 잘 나눠졌는지 시각적으로 확인한다.
splits %>%
tk_time_series_cv_plan() %>%
plot_time_series_cv_plan(.date_var = 날짜,
.value = 확진자)tidymodels 생태계의 recipe 팩키지 recipe() 함수를 사용해서 피쳐 공학(feature engineering) 작업을 수행한다. 추후, 다양한 피처를 작업할 예정이라 대략적인 틀만 잡아둔다.
recipe_spec <- recipes::recipe(확진자 ~ 날짜, data = training(splits))
recipe_spec %>% prep() %>% juice()# A tibble: 309 x 2
날짜 확진자
<date> <dbl>
1 2020-01-23 0
2 2020-01-24 0.693
3 2020-01-25 0
4 2020-01-26 0.693
5 2020-01-27 0.693
6 2020-01-28 0
7 2020-01-29 0
8 2020-01-30 0
9 2020-01-31 2.08
10 2020-02-01 0.693
# ... with 299 more rows
단변량 예측 알고리즘으로 다음 세가지 대표적인 알고리즘을 적용시킨다.
대표적인 시계열 데이터 예측모형 ARIMA로 적합시킨다.
mdl_arima_spec <- arima_reg() %>%
set_engine("auto_arima")
wkfl_fit_arima <- workflows::workflow() %>%
add_recipe(recipe_spec) %>%
add_model(mdl_arima_spec) %>%
fit(training(splits))
wkfl_fit_arima== Workflow [trained] ==========================================================
Preprocessor: Recipe
Model: arima_reg()
-- Preprocessor ----------------------------------------------------------------
0 Recipe Steps
-- Model -----------------------------------------------------------------------
Series: outcome
ARIMA(2,1,0)(2,0,0)[7]
Coefficients:
ar1 ar2 sar1 sar2
-0.3797 -0.1385 0.0643 0.0514
s.e. 0.0565 0.0567 0.0568 0.0586
sigma^2 estimated as 0.3447: log likelihood=-271.11
AIC=552.22 AICc=552.42 BIC=570.88
지수평활(exponential smoothing) 중 ETS(Error-Trend-Season)를 예측모형으로 적합시킨다.
mdl_ets_spec <- exp_smoothing(
error = "auto",
trend = "auto",
season = "auto",
damping = "auto"
) %>%
set_engine("ets")
wkfl_fit_ets <- workflows::workflow() %>%
add_recipe(recipe_spec) %>%
add_model(mdl_ets_spec) %>%
fit(training(splits))
wkfl_fit_ets== Workflow [trained] ==========================================================
Preprocessor: Recipe
Model: exp_smoothing()
-- Preprocessor ----------------------------------------------------------------
0 Recipe Steps
-- Model -----------------------------------------------------------------------
ETS(A,N,N)
Call:
forecast::ets(y = outcome, model = model_ets, damped = damping_ets)
Smoothing parameters:
alpha = 0.5998
Initial states:
l = 0.2034
sigma: 0.5845
AIC AICc BIC
1443.701 1443.780 1454.901
TBATS를 예측모형으로 적합시킨다.
mdl_tbats_spec <- seasonal_reg(
mode = "regression",
seasonal_period_1 = "auto"
) %>%
set_engine("tbats")
wkfl_fit_tbats <- workflows::workflow() %>%
add_recipe(recipe_spec) %>%
add_model(mdl_tbats_spec) %>%
fit(training(splits))
wkfl_fit_tbats== Workflow [trained] ==========================================================
Preprocessor: Recipe
Model: seasonal_reg()
-- Preprocessor ----------------------------------------------------------------
0 Recipe Steps
-- Model -----------------------------------------------------------------------
TBATS(1, {0,0}, -, {<7,2>})
Call: forecast::tbats(y = outcome, use.parallel = use.parallel)
Parameters
Alpha: 0.5932765
Gamma-1 Values: -0.003537627
Gamma-2 Values: 0.002531605
Seed States:
[,1]
[1,] 0.85064651
[2,] 0.11894200
[3,] -0.04456623
[4,] 0.06360484
[5,] -0.02010554
Sigma: 0.5713561
AIC: 1441.682
페이스북에서 개발한 Prophe 예측모형으로 적합시킨다.
mdl_prophet_spec <- prophet_reg(
) %>%
set_engine("prophet")
wkfl_fit_prophet <- workflows::workflow() %>%
add_recipe(recipe_spec) %>%
add_model(mdl_prophet_spec) %>%
fit(training(splits))
wkfl_fit_prophet== Workflow [trained] ==========================================================
Preprocessor: Recipe
Model: prophet_reg()
-- Preprocessor ----------------------------------------------------------------
0 Recipe Steps
-- Model -----------------------------------------------------------------------
PROPHET Model
- growth: 'linear'
- n.changepoints: 25
- changepoint.range: 0.8
- yearly.seasonality: 'auto'
- weekly.seasonality: 'auto'
- daily.seasonality: 'auto'
- seasonality.mode: 'additive'
- changepoint.prior.scale: 0.05
- seasonality.prior.scale: 10
- holidays.prior.scale: 10
- logistic_cap: NULL
- logistic_floor: NULL
- extra_regressors: 0
workflow 객체를 modeltime 팩키지 modeltime_table() 함수에 넣어 객체로 만든 후에, 추가로 지수평활법 EST 예측모형을 추가한다. modeltime_accuracy() 함수를 사용해서 모형 성능을 파악한다.
model_tbl <- modeltime_table(
wkfl_fit_arima,
wkfl_fit_ets,
wkfl_fit_tbats,
wkfl_fit_prophet
) %>%
update_model_description(.model_id = 1, "ARIMA") %>%
update_model_description(.model_id = 2, "ETS") %>%
update_model_description(.model_id = 3, "TBATS") %>%
update_model_description(.model_id = 4, "Prophet")
model_tbl %>%
modeltime::modeltime_accuracy(testing(splits))# A tibble: 4 x 9
.model_id .model_desc .type mae mape mase smape rmse rsq
<int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 ARIMA Test 0.320 4.71 2.64 4.85 0.387 0.775
2 2 ETS Test 0.410 6.02 3.37 6.26 0.484 NA
3 3 TBATS Test 0.432 6.34 3.56 6.62 0.508 0.0530
4 4 Prophet Test 1.28 19.2 10.6 21.3 1.31 0.478
시험데이터를 통해 모형으로 예측한 값을 시각화한다. ETS MAE 값이 ARIMA 예측모형과 별다른 차이가 나고 있지 않지만, 신뢰구간이 생긴 것은 그나마 다행이다.
calibration_tbl <- model_tbl %>%
modeltime_calibrate(
new_data = testing(splits)
)
calibration_tbl %>%
modeltime_forecast(
new_data = testing(splits),
actual_data = history_tbl,
keep_data = TRUE,
conf_interval = 0.10
) %>%
mutate(
across(.cols = c(.value, 확진자), .fns = expm1) # log1p --> expm1 원척도로 변환
) %>%
plot_modeltime_forecast(
.legend_max_width = 60,
.legend_show = TRUE,
.conf_interval_show = TRUE,
.conf_interval_alpha = 0.20,
.conf_interval_fill = "lightblue",
.title = "코로나19 확진자 1개월 예측"
)앞선 모형은 history_tbl 을 훈련/시험 데이터에 대해 적합을 시킨 것이라 … 이제 시간을 확대하여 modeltime_refit() 함수를 사용해서 모형을 전체 데이터에 대해 다시 적합시킨다.
refit_tbl <- calibration_tbl %>%
modeltime_refit(data = history_tbl)마지막으로 앞서 구축된 모형을 바탕으로 현재까지 입수된 데이터를 바탕으로 한달 후를 예측한다.
refit_tbl %>%
modeltime_forecast(
new_data = forecast_tbl,
actual_data = history_tbl,
conf_interval = 0.3
) %>%
mutate(
across(.cols = c(.value), .fns = expm1) # log1p --> expm1 원척도로 변환
) %>%
plot_modeltime_forecast(
.legend_max_width = 25,
.conf_interval_fill = "lightblue",
.conf_interval_alpha = 0.7,
.interactive = TRUE
)마지막 단계로 데이터와 모형을 저장하여 다음 단계로 나가기 위한 작업을 수행한다.
corona_fcst_list <- list(
# 데이터 ---
data = full_tbl,
# 예측모형 ----
model = list(
wkfl_fit_lm = wkfl_fit_lm,
wkfl_fit_arima = wkfl_fit_arima,
wkfl_fit_ets = wkfl_fit_ets,
wkfl_fit_tbats = wkfl_fit_tbats,
wkfl_fit_prophet = wkfl_fit_prophet
)
)
# 모형 + 데이터 저장 ----
corona_fcst_list %>%
write_rds("data/corona_fcst_list.rds")데이터 과학자 이광춘 저작
kwangchun.lee.7@gmail.com